Cory Whitney
"2019-03-19"
Andrew MacDonald @polesasunder
Keep it tidy
Use ‘#’ to annotate and not run
If not Rmarkdown then at least use ‘—-’ or ‘####’
#Section 1—-
#Section 2####
#Section 3####
TOC in upper right console
Keep it tidy
# The easiest way to get libraries for today is to install the whole tidyverse:
#install.packages("tidyverse")
library(tidyverse)
browseVignettes(package = "tidyverse")
The tidy tools manifesto
Keep it tidy
Keep it tidy
H. Wickham, Tidy Data, Journal of Statistical Software, 59(10)
Three things make a dataset tidy:
Format of dplyr
Using dplyr
library(dplyr)
genomicsclass.github.io/book/pages/dplyr_tutorial
Roger Peng
Subsetting
Select
aca_work_filter<-select(participants_data, academic_parents, working_hours_per_day)
academic_parents working_hours_per_day
1 N 8
2 Y 8
3 N 7
4 Y 16
5 N 12
6 N 16
non_aca_work_filter<-select(participants_data, -academic_parents, -working_hours_per_day)
batch age gender continent_of_origin research_continent
1 2017 33 F Europe Europe
2 2017 31 F South America South America
3 2017 30 F South America South America
4 2017 28 M Africa Africa
5 2017 30 M Africa Africa
6 2017 30 M Africa Africa
number_of_publications number_of_siblings km_home_to_zef years_of_study
1 6 1 1.7 20
2 0 2 40.0 9
3 1 2 10370.0 7
4 1 1 15.0 3
5 3 5 6.0 7
6 3 4 8.3 10
days_to_email_response letters_in_first_name
1 1 4
2 1 6
3 1 7
4 1 7
5 1 6
6 2 6
Subsetting
Filter
work_filter<-filter(participants_data, working_hours_per_day >10)
work_name_filter<-filter(participants_data, working_hours_per_day >10 & letters_in_first_name >6)
work_name_filter
batch age gender continent_of_origin research_continent
1 2017 28 M Africa Africa
number_of_publications working_hours_per_day number_of_siblings
1 1 16 1
academic_parents km_home_to_zef years_of_study days_to_email_response
1 Y 15 3 1
letters_in_first_name
1 7
Rename
participants_data <- rename(participants_data, name_length = letters_in_first_name)
participants_data <- rename(participants_data,
daily_labor = working_hours_per_day)
names(participants_data)
[1] "batch" "age"
[3] "gender" "continent_of_origin"
[5] "research_continent" "number_of_publications"
[7] "daily_labor" "number_of_siblings"
[9] "academic_parents" "km_home_to_zef"
[11] "years_of_study" "days_to_email_response"
[13] "name_length"
Mutate
participants_data <- mutate(participants_data, daily_labor*mean(daily_labor, na.rm=T))
# head(participants_data)
participants_data <- mutate(participants_data, number_of_siblings -mean(number_of_siblings))
head(participants_data)
batch age gender continent_of_origin research_continent
1 2017 33 F Europe Europe
2 2017 31 F South America South America
3 2017 30 F South America South America
4 2017 28 M Africa Africa
5 2017 30 M Africa Africa
6 2017 30 M Africa Africa
number_of_publications daily_labor number_of_siblings academic_parents
1 6 8 1 N
2 0 8 2 Y
3 1 7 2 N
4 1 16 1 Y
5 3 12 5 N
6 3 16 4 N
km_home_to_zef years_of_study days_to_email_response name_length
1 1.7 20 1 4
2 40.0 9 1 6
3 10370.0 7 1 7
4 15.0 3 1 7
5 6.0 7 1 6
6 8.3 10 2 6
daily_labor * mean(daily_labor, na.rm = T)
1 70.5000
2 70.5000
3 61.6875
4 141.0000
5 105.7500
6 141.0000
number_of_siblings - mean(number_of_siblings)
1 -2.375
2 -1.375
3 -1.375
4 -2.375
5 1.625
6 0.625
Group by create a commute category
participants_data <- mutate(participants_data, commute = factor(1* (km_home_to_zef > 10), labels = c("commuter", "local")), na.rm=T)
new_participants_data <- drop_na(participants_data)
commuter_data <- group_by(new_participants_data, commute)
commuter_data
# A tibble: 15 x 17
# Groups: commute [2]
batch age gender continent_of_or… research_contin… number_of_publi…
* <int> <int> <fct> <fct> <fct> <int>
1 2017 33 F Europe Europe 6
2 2017 31 F South America South America 0
3 2017 30 F South America South America 1
4 2017 28 M Africa Africa 1
5 2017 30 M Africa Africa 3
6 2017 30 M Africa Africa 3
7 2017 33 F Africa Africa 0
8 2017 30 F South America South America 2
9 2017 27 M Europe Europe 0
10 2018 32 F China China 2
11 2018 32 F Asia Asia 2
12 2018 36 M Africa Africa 10
13 2018 43 F North America North America 2
14 2018 32 M Asia Asia 2
15 2018 28 M Asia Asia 0
# … with 11 more variables: daily_labor <int>, number_of_siblings <int>,
# academic_parents <fct>, km_home_to_zef <dbl>, years_of_study <int>,
# days_to_email_response <int>, name_length <int>, `daily_labor *
# mean(daily_labor, na.rm = T)` <dbl>, `number_of_siblings -
# mean(number_of_siblings)` <dbl>, commute <fct>, na.rm <lgl>
Summarize get a summary of travel times and days to response
commuter_summary <- summarize(commuter_data, mean(days_to_email_response), median(name_length))
commuter_summary
# A tibble: 2 x 3
commute `mean(days_to_email_response)` `median(name_length)`
<fct> <dbl> <dbl>
1 commuter 1.55 5
2 local 1.25 7
Pipeline %>%
pipe_data <- participants_data %>%
mutate(commute = factor(1* (km_home_to_zef > 10),
labels = c("commuter", "local"))) %>%
drop_na() %>%
group_by(commute) %>%
summarize(mean(days_to_email_response), median(name_length),
max(years_of_study)) %>%
as.data.frame
pipe_data
commute mean(days_to_email_response) median(name_length)
1 commuter 1.545455 5
2 local 1.250000 7
max(years_of_study)
1 20
2 9
dplyr and magrittrlibrary(purrr)
purr Cheatsheet
Use purrr to solve: split a data frame into pieces, fit a model to each piece, compute the summary, then extract the R2.
library(purrr)
purrr_regression <- mtcars %>%
split(.$cyl) %>% # from base R
map(~ lm(mpg ~ wt, data = .)) %>%
map(summary) %>%
map_dbl("r.squared")
purrr_regression
4 6 8
0.5086326 0.4645102 0.4229655
If there is time